## (c) Ramya Parthasarathy

## This file cleans and pre-processes village assembly transcripts collected as part of an impact evaluation of the Pudhu Vaazhvu Project, a poverty alleviation and livelihoods program implemented by the World Bank and Government of Tamil Nadu. 

## Village assembly proceedings were recorded in a matched sample of 50 treatment and 50 control villages on Republic Day, in January 2014. Recordings were then transcribed into Tamil, and then manually translated into English by a team from our survey firm.

## Each transcript begins with the geographic identifiers (district, block, village name), and the project status of the village. Each speaker is identified by his or her gender, and official position (where applicable). 

#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Start-up marker, emitted before the package imports below run.
print('hello world')

## import packages
import re, os, nltk, collections, random, csv, urllib2, codecs, string, io, pandas, pprint

from nltk import word_tokenize

from nltk.util import ngrams
from nltk.util import bigrams
from nltk.util import trigrams

from collections import Counter

# Prefer the C implementation when it exists and fall back to the pure
# Python module otherwise.  Only ImportError is caught so that unrelated
# failures (e.g. KeyboardInterrupt) are not silently swallowed.
try:
    import cPickle as pickle
except ImportError:
    import pickle


# Stemmers and lemmatizer: one module-level instance of each is created
# as soon as the script starts.
# NOTE(review): none of st, pt, sb or wn is referenced again in the
# visible part of this script - confirm before removing any of them.
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()

from nltk.stem import PorterStemmer
pt = PorterStemmer()

from nltk.stem.snowball import EnglishStemmer
sb = EnglishStemmer()

from nltk.stem.wordnet import WordNetLemmatizer
wn = WordNetLemmatizer()


# English stopword list from the NLTK corpus reader.
from nltk.corpus import stopwords
# The returned list is discarded here; presumably the call only checks up
# front that the stopword corpus is installed - TODO confirm.
stopwords.words('english')

## Folder holding the cleaned transcripts; it becomes the working directory
## so that the bare file names listed below can be opened directly
wd = "/Users/rmparthasarathy/Dropbox/wb-pvp/GS_Stata/Transcript_Analysis/Cleaned"
os.chdir(wd)


#-------------------------------------------------------------------------------------------------------------------------------------------------

## Names of all entries found in the transcript folder
transcripts = os.listdir(wd)

#-------------------------------------------------------------------------------------------------------------------------------------------------

## SAMPLE TRANSCRIPT
#f = io.open(transcripts[35], 'r', encoding='utf-8')
#text  = f.read()
#print text
#-------------------------------------------------------------------------------------------------------------------------------------------------

## Create a nested dictionary with the following levels:
    ## Corpus (100 transcripts), each identified by a village key
    ## Village (with geographic identifiers and projectstatus)
    ## Speech (gender, position, content, noise)


## Module-level containers filled while the transcripts are processed
corpus = dict()        # top-level dictionary, one entry per village
village_list = list()  # list of villages
speechids = list()     # list of speech names
word_list = list()     # list of unigrams
prop_nouns = list()    # list of proper nouns

## Terms that must NOT be treated as proper nouns: anything listed here is
## taken back out of the named-entity results, so it stays in the text.
## Every entry ends with a comma on purpose: two adjacent string literals
## without a comma are silently joined by Python into one string (e.g.
## 'High Court' 'Electricity Board' -> 'High CourtElectricity Board'),
## which would make both entries impossible to match.
removelist = [
    'PLF', 'JCB', 'VPRC', 'THAI', 'Free',
    'Supreme Court', 'Labor', 'Republic', 'Fourth', 'Income',
    'Family Welfare Department', 'School Education Department',
    'Tally', 'Transparency', 'Tamil', 'Domestic Violence',
    'Rape Kidnapping', 'Dowry', 'Sender', 'PETITION', 'Try',
    'Cooked', 'Action', 'Newspaper', 'Suitable', 'Womens',
    'Supreme', 'High Court', 'Electricity Board', 'Thanks',
    'Health', 'Agriculture', 'Family Welfare', 'District',
    'Commotion', 'Next', 'Rural Development', 'Child Marriage', 'Good',
    'Public', 'Prizes', 'Panchayat Raj', 'BDO', 'Rupees', 'CST',
    'Pudhu Vazhvu Project', 'Mental', 'National', 'Serial', 'Allotment',
    'Revenue Department', 'Electricity', 'Subject Basic', 'Group Loan',
    'Child', 'CLC', 'Road', 'CDF', 'Balanced', 'Female Child',
    'Scheme', 'Administrative', 'Tell', 'Power', 'Panchayat', 'Running',
    'Liquids', 'Groups', 'Gandhi', 'Economic', 'Honourable', 'Fifth',
    'Bank',
]


# -----------------------------------------------------------------------
# Build the nested `corpus` dictionary, one transcript file at a time.
#
# After blank lines are dropped, each transcript is read as:
#   line 0 = district, line 1 = block, line 2 = village,
#   line 3 = project status, and every later line = one speech of the
#   form "<gender> (<position>): <text>" (the position part is optional).
#
# This script targets Python 2 (it imports urllib2 and cPickle), so
# f.read() yields a byte string and the patterns below operate on bytes.
# -----------------------------------------------------------------------

# Loop-invariant helpers, built once instead of once per speech/token.

# Strips every non-ASCII byte.  This also covers the UTF-8 right single
# quote (\xe2\x80\x99) and ellipsis (\xe2\x80\xa6), so those need no
# separate substitution.
_NON_ASCII_RE = re.compile("[\x80-\xff]")

# dd.mm.yyyy-style dates.  The separators must be non-alphanumeric; a bare
# "." in the pattern would match any character and turn every run of ten
# or more digits (e.g. a phone number) into DATE.
_DATE_RE = re.compile(r"\d{2,}[^0-9A-Za-z]\d{2,}[^0-9A-Za-z]\d{4,}")

# The rupee marker "Rs", with or without a trailing dot and spaces.  The
# dot is optional and escaped so the character after "Rs" is never eaten
# ("Rs500" must not lose its first digit), and the look-arounds keep the
# pattern from firing inside a longer alphabetic word.
_RUPEE_RE = re.compile(r"(?<![A-Za-z])Rs(?![A-Za-z])\.?\s*")

# Two digit groups separated only by whitespace are glued back together.
_SPLIT_NUMBER_RE = re.compile(r"(\d+)\s+(\d+)")

_NON_WORD_RE = re.compile(r"\W")
_PUNCTUATION = set(string.punctuation)

# Sets give O(1) membership tests; the stopword list in particular was
# previously re-read from the corpus reader for every single token.
_STOPWORDS = set(stopwords.words('english'))
_NOT_PROPER_NOUNS = set(removelist)

# Mirror of `prop_nouns` (still empty at this point) used for fast lookups.
_prop_noun_lookup = set(prop_nouns)

# NOTE(review): index 0 is skipped, presumably because the first directory
# entry is not a transcript (e.g. a hidden system file).  os.listdir()
# makes no ordering guarantee, so confirm that this never drops a real
# transcript.
for i in range(1, len(transcripts)):
    ## Read in transcript as text; the with-block closes the handle
    with open(transcripts[i], 'r') as f:
        text = f.read()
    text = _NON_ASCII_RE.sub("", text)
    lines = [line for line in text.splitlines() if line.strip()]

    # Everything after the four header lines is speech
    speeches = lines[4:]

    #--------------------------------------------------------------------

    # create dictionary for village and store the village-level ids
    village = lines[2].lower()
    corpus[village] = {}
    corpus[village]['projectstatus'] = lines[3]
    corpus[village]['district'] = lines[0].lower()
    corpus[village]['block'] = lines[1].lower()

    #--------------------------------------------------------------------

    # One id per speech, numbered in order of appearance
    ids = ["speech" + str(j) for j in range(len(speeches))]
    speechids.extend(ids)

    for k, body in enumerate(speeches):
        speechid = ids[k]
        corpus[village][speechid] = {}

        ## Speaker label is everything before the first colon, content
        ## everything after it
        speaker, colon, content = body.partition(":")
        if not colon:
            # Fail with the location instead of an anonymous IndexError
            raise ValueError("%s: speech line %d has no 'speaker:' prefix: %r"
                             % (transcripts[i], k, body[:60]))

        ## Raw Content
        corpus[village][speechid]['raw_content'] = content

        ## Gender is the part of the label before "("; the position is the
        ## part after it.  Code as citizen if no official position is
        ## given.  Testing for "(" itself (rather than counting words)
        ## also handles "Male(President)" and labels with several words
        ## but no parenthesis.
        label, paren, remainder = speaker.partition("(")
        gender = label.lower().strip()
        if paren:
            position = _NON_WORD_RE.sub('', remainder)
        else:
            position = 'Citizen'
        position = position.lower().strip()

        ## Save to speech-level dictionary
        corpus[village][speechid]['gender'] = gender
        corpus[village][speechid]['position'] = position

        #----------------------------------------------------------------
        # CLEAN TEXT
        # replace dates, then spell out rupee markers
        content_1 = _DATE_RE.sub("DATE", content)
        content_1 = _RUPEE_RE.sub('Rupees ', content_1)

        ## remove punctuation
        content_2 = ''.join(ch for ch in content_1 if ch not in _PUNCTUATION)

        # rupee amounts with spaces in between
        content_3 = _SPLIT_NUMBER_RE.sub(r"\1\2", content_2)

        ## Tokenize once; the same tokens feed the tagger and the cleaning
        tokens = word_tokenize(content_3)

        ## Collect named entities found in this speech
        tagged = nltk.pos_tag(tokens)
        chunked = nltk.ne_chunk(tagged, binary=False)
        nes = [' '.join(leaf[0] for leaf in ne.leaves())
               for ne in chunked if isinstance(ne, nltk.tree.Tree)]

        # Only the new entities need checking against the keep-list; the
        # ones already stored were checked when they were added.
        new_nouns = [ne for ne in nes if ne not in _NOT_PROPER_NOUNS]
        prop_nouns.extend(new_nouns)
        _prop_noun_lookup.update(new_nouns)

        ## Remove Proper Nouns (every entity seen so far in the run)
        # NOTE(review): multi-word entities are stored as one space-joined
        # string, so they can never equal a single token and are not
        # removed from the text - confirm whether that is intended.
        content_5 = [tok for tok in tokens if tok not in _prop_noun_lookup]

        ## counts of key flag words, ignoring case so that "Commotion"
        ## (kept capitalised via removelist) is counted as well
        lowered = [tok.lower() for tok in content_5]
        noise = lowered.count("commotion") + lowered.count("murmur")
        corpus[village][speechid]['noise'] = noise

        ## count remaining numeric claims
        corpus[village][speechid]['numbers'] = sum(
            tok.isdigit() for tok in content_5)

        ## Remove all digits
        content_6 = [x for x in content_5 if not (x.isdigit()
                       or x[0] == '-' and x[1:].isdigit())]

        ## Make Lower Case
        content_7 = [x.lower() for x in content_6]

        # REMOVE STOPWORDS
        content_8 = [x for x in content_7 if x not in _STOPWORDS]

        content_9 = ' '.join(content_8)
        corpus[village][speechid]['cleaned_content'] = content_9

        # Progress marker; formatted so the output is the same line of
        # text whether print is a statement or a function
        print("%s %s" % (village, speechid))



#########################################################################
## Pickle nested dictionary into the parent analysis folder
wd = "/Users/rmparthasarathy/Dropbox/wb-pvp/GS_Stata/Transcript_Analysis/"
os.chdir(wd)

filename = "transcript_corpus_v1.pkl"
# The with-block closes (and flushes) the file even if pickle.dump raises,
# so a failed run cannot leave an open handle behind.
with open(filename, 'wb') as fileobj:
    pickle.dump(corpus, fileobj)

#########################################################################
#########################################################################
#########################################################################
#########################################################################

